• NMF on Gene expression and Chromatin accessibility data
    • Gene expression (RNAseq)
      • Data loading
      • Applying NMF
      • Factorization quality metrics and optimal K
      • H Matrix, W normalized:
      • Gene set enrichment analysis
    • Chromatin accessibility (ATACseq)
      • Data loading
      • Applying NMF
      • Factorization quality metrics and optimal K
      • H Matrix, W normalized:
    • Integrative RNAseq & ATACseq
      • Data loading
      • Applying integrative NMF
      • Factorization quality metrics and optimal K
      • H Matrices:
      • Gene set enrichment analysis
    • iNMF Recovery plots:

NMF on Gene expression and Chromatin accessibility data

# Helper functions 
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Compute K stats and                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
my.kstats <- function(NMFexperiment){
  # calc different k stats
  NMFexperiment <- computeFrobErrorStats(NMFexperiment)
  NMFexperiment <- computeSilhoutteWidth(NMFexperiment)
  NMFexperiment <- computeCopheneticCoeff(NMFexperiment)
  NMFexperiment <- computeAmariDistances(NMFexperiment)
  return(NMFexperiment)
}

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
my.plotKstats <- function(NMFexperiment, title){
  # visualize k stats
  gg.optKr <- plotKStats(NMFexperiment)
  gg.optKr <- gg.optKr + theme_bw() + 
    ggtitle(title) +
    theme(plot.title=element_text(hjust=0.5))
  return(gg.optKr)
}


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Plot H matrix heatmap                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

my.plotHMatrices <- function(NMFexperiment, heat.anno, col=viridis(n=100)){
  for(ki in names(NMFexperiment@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(NMFexperiment, k = ki)
  colnames(tmp.hmatrix) <- colnames(NMFexperiment)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = col,
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = TRUE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  }
}
#my.plotHMatrices(rna.norm.nmf.exp, heat.anno)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      Recovery plots functions                              ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

auc <- function(rnk.list,max=NULL) {
  aux = sapply(rnk.list,function(rnk) {
    if (is.null(max)) {max = max(rnk)} 
    rnk = sort(rnk)
    X = 0
    i = 1
    ngenes = length(rnk)
    while ((rnk[i] <= max) && (i <= length(rnk))) {X = X + max -rnk[i];i = i+1}
    rauc = X/(i-1)/max
    return(rauc)
  })
  return(aux)
}

roc <- function(rnk.list,max=NULL,title=NULL) {
  require(RColorBrewer)
  col = brewer.pal(length(rnk.list),'Set1')
  rnk = c(1,rnk.list[[1]])
  if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
  plot(rnk,(1:length(rnk))/length(rnk),type='s',col=col[1],lwd=3,main=title,ylab='',xlab='Ranks')
  for (i in 2:length(rnk.list)) {
    rnk = c(1,rnk.list[[i]])
    if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
    lines(rnk,(1:length(rnk))/length(rnk),type='s',col=col[i],lwd=3)
  }
  L = length(rnk.list[[1]])
  abline(1/L,(1-1/L)/(max),lty=2,lwd=2,col='darkgrey')
  legend('bottomright',legend = names(rnk.list),col=col,lwd=3)
}

recovery_plot <- function(h, annot, annotID){
  which.a = annotID
  annot.factor <- annot[,annotID]
  
  n.samples = nrow(annot)
  
  ALL.RNKS = lapply(levels(annot.factor),function(l) {
  RNKS=lapply(1:nrow(h),function(i) {
    exp = sort(h[i,],decreasing=TRUE)
    i.rnk = match(rownames(annot)[annot.factor==l],names(exp))
    i.rnk = sort(i.rnk[!is.na(i.rnk)])
    return(i.rnk)
  })
  names(RNKS) = paste0('Sig ',1:length(RNKS))
  return(RNKS)
  })
    names(ALL.RNKS) = levels(annot.factor)
    
    AUC.RAND = lapply(ALL.RNKS,function(r) {
    do.call('rbind',lapply(r, function(x) {
      ##
      l = lapply(1:500,function(i) {
        sample(1:n.samples,length(x))
      })
      aux = auc(l,max=n.samples)
      return(c(mean(aux),sd(aux)))
    }))
      })
  
  AUC = lapply(ALL.RNKS,auc,max=n.samples)
  
  
  PVAL = lapply(1:length(AUC),function(i) {
    x = data.frame(AUC.RAND[[i]],AUC[[i]])
    colnames(x) = c('mean','sd','val')
    z = (x[,3]-x[,1])/x[,2]
    p = ifelse(z>0,pnorm(z,lower.tail=FALSE),pnorm(z))
    x$z = z
    x$p = p
    return(x)
  })
  names(PVAL) = names(AUC)
  
  
  for (n in names(ALL.RNKS)) {
    cat("\n")
    cat("  \n##### ",  n, "  \n  ")
    #print(n)
    RNKS = ALL.RNKS[[n]]
    names(RNKS) = paste0(names(RNKS),' - Pval = ',sprintf('%.1e',PVAL[[n]][,5]))
    roc(RNKS,max=n.samples,title=paste0(annotID,' - level : ',n))
    
  }
}


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                    Plot H matrix heatmap integrative NMF                   ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##


plotH_oneView <- function(int_nmf, viewID, k, heat.anno, col=viridis(100), 
                          scale_color=TRUE, displayID = viewID){
  # Find which iteration returned the best results
  
  k <- paste0("k", k)
  top_idx <- int_nmf@best_factorization_idx
  idx <- top_idx[match(k, names(top_idx))]
  
  sharedH <- int_nmf@shared_HMatrix_list[[k]][[idx]]
  Hview   <- int_nmf@view_specific_HMatrix_list[[k]][[idx]][[viewID]]
  
  # Define total H matrix
  totalH <- sharedH + Hview
  # Color Function
  if (scale_color) {
    colf <- circlize::colorRamp2(seq(0, max(totalH), length.out = 100), col)
  } else {
    colf <- col
  }
  
  main_hist <- hclust(as.dist(1 - cor(totalH, method = "pearson")))
  
  tH.heatmap <- Heatmap(totalH,
                        col = colf,
                        name = "Total Exposure",
                        column_title = "Total H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno, 
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  
  sH.heatmap <- Heatmap(sharedH,
                        col = colf,
                        name = "Shared Exposure",
                        column_title = "Shared H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno,
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  
  vH.heatmap <- Heatmap(Hview,
                        col = colf,
                        name = "View Specific Exposure",
                        column_title = "View specific H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno,
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  #print(tH.heatmap + sH.heatmap + vH.heatmap)
  
  
  ht_global_opt(heatmap_legend_grid_height = unit(.25, "cm"))
  ht_list <- tH.heatmap + sH.heatmap + vH.heatmap
  draw(ht_list, row_title = displayID)
  ht_global_opt(RESET = TRUE)
}

Gene expression (RNAseq)

Data loading

Read normalized gene expression matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
rna.norm.mat <- readRDS("data/rnaseq/rnaseq_normalized_counts.RDS")
rna.annotation <- readRDS("data/rnaseq/rnaseq_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

cat("Dimension of transcriptome dataset (RNAseq):  \n\n  ") 

Dimension of transcriptome dataset (RNAseq):

kable(data.frame(dim(rna.norm.mat), row.names = c("features", "samples")), 
      col.names = "") 
features 21811
samples 45

Applying NMF

Applying Non-Negative Matrix Factorization (NMF) to normalized transcriptome data (RNAseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 5
k.max <- 10
outer.iter <- 10
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 5
k.max 10
outer.iter 10
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.nmf.exp <- nmfExperimentFromMatrix(matrix = rna.norm.mat)


rna.nmf.exp <- runNMFtensor(rna.nmf.exp, 
                            k.min = k.min, 
                            k.max = k.max, 
                            outer.iter = outer.iter, 
                            inner.iter = inner.iter, 
                            conver.test.stop.threshold = 40)
## [1] "2019-04-03 02:06:15 UTC"
## Factorization rank:  5 
##  Iteration:  10 
## [1] "NMF converged after  268,130,257,156,179,90,134,114,202,171 iterations"
## [1] "2019-04-03 02:06:24 UTC"
## Factorization rank:  6 
##  Iteration:  10 
## [1] "NMF converged after  169,167,362,224,224,121,170,161,218,136 iterations"
## [1] "2019-04-03 02:06:34 UTC"
## Factorization rank:  7 
##  Iteration:  10 
## [1] "NMF converged after  193,169,75,130,162,305,116,175,228,233 iterations"
## [1] "2019-04-03 02:06:44 UTC"
## Factorization rank:  8 
##  Iteration:  10 
## [1] "NMF converged after  157,264,134,191,207,158,275,211,371,198 iterations"
## [1] "2019-04-03 02:06:55 UTC"
## Factorization rank:  9 
##  Iteration:  10 
## [1] "NMF converged after  131,226,196,137,189,126,182,242,145,155 iterations"
## [1] "2019-04-03 02:07:06 UTC"
## Factorization rank:  10 
##  Iteration:  10 
## [1] "NMF converged after  217,308,235,200,104,111,141,180,212,263 iterations"

The NMF has now run, and we can have a look at the W and H matrices for k=9 for example:

w = WMatrix(rna.nmf.exp,k=9)
pheatmap(w[1:50,],cluster_rows = F,cluster_cols = F)

h = HMatrix(rna.nmf.exp,k=9)
pheatmap(h,cluster_rows = F,cluster_cols = F)

We now want to normalize the columns of W to 1:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                               Normalize NMF                                ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.norm.nmf.exp <- normalizeW(rna.nmf.exp)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      K stats and normalization                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
rna.norm.nmf.exp <- my.kstats(rna.norm.nmf.exp)

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(rna.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        Generate river plot                                 ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(rna.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrix, W normalized:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(rna.annotation$color[match(levels(rna.annotation$Celltype), rna.annotation$Celltype)],
         levels(rna.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = rna.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = TRUE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

for(ki in names(rna.norm.nmf.exp@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(rna.norm.nmf.exp, k = ki)
  colnames(tmp.hmatrix) <- colnames(rna.norm.nmf.exp)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = viridis(100),
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = FALSE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  
  cat("  \n Recovery plots for k=",  ki, "  \n  ")
  
  recovery_plot(tmp.hmatrix, rna.annotation, "Celltype")
  
  }

Gene set enrichment analysis

Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms

The optimal factorization rank selected was: K = 9

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                              W matrix Z scores                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.wmatrix <- WMatrix(rna.norm.nmf.exp, k = 9)
rownames(rna.wmatrix) <- rownames(rna.norm.nmf.exp)

#Zscore for each signature
rna.wmatrix.zscores <- apply(rna.wmatrix, MARGIN=2, function(wmat_score){
  (wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##         GAGE (Generally Applicable Gene-set Enrichment) analysis           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)

#run GAGE analysis
rna.msigDB.enrichment <- gage(rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)

#Drop NAs for upregulated
rna.msigDB.enrichment <- as.data.frame(rna.msigDB.enrichment$greater)
rna.msigDB.enrichment <- rna.msigDB.enrichment[!is.na(rna.msigDB.enrichment$p.geomean),]
rna.msigDB.enrichment <- rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]

# Select only more enriched terms in one signature compared to the others
idx <- apply(rna.msigDB.enrichment, 1, function(term){
  term <- -log10(term)
  # Change 0 to small value to avoid NAs
  term[term == 0] <- 1e-40
  # find if this term is more enriched in one signature compared to others
  is.enrich <- sapply(term, function(x){
    # p-value 5 times greater than at least 5 other signatures
    sum(x/term > 5) > 5
  })
  any(is.enrich)
})

rna.msigDB.enrichment <- rna.msigDB.enrichment[idx,]

# Print table
datatable(rna.msigDB.enrichment, filter="top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = list(list(extend = 'collection',
                                             buttons = c('excel', 'csv'),
                                             text = 'DOWNLOAD DATA')))) %>%
  formatSignif(columns=colnames(rna.msigDB.enrichment), digits=3)

Chromatin accessibility (ATACseq)

Data loading

Read normalized chromatin accessibility matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
atac.norm.mat <- readRDS("data/atacseq/atacseq_normalized_counts.RDS")
atac.annotation <- readRDS("data/atacseq/atacseq_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

cat("Dimension of Chromatin accessibility dataset (ATACseq):  \n\n  ") 

Dimension of Chromatin accessibility dataset (ATACseq):

kable(data.frame(dim(atac.norm.mat), row.names = c("features", "samples")), 
      col.names = "") 
features 123102
samples 68

Applying NMF

Applying Non-Negative Matrix Factorization (NMF) to normalized Chromatin accessibility data (ATACseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 8
k.max 10
outer.iter 5
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.nmf.exp <- nmfExperimentFromMatrix(matrix = atac.norm.mat)


atac.nmf.exp <- runNMFtensor(atac.nmf.exp, 
                            k.min = k.min, 
                            k.max = k.max, 
                            outer.iter = outer.iter, 
                            inner.iter = inner.iter, 
                            conver.test.stop.threshold = 40)
## [1] "2019-04-03 02:10:25 UTC"
## Factorization rank:  8 
## [1] "NMF converged after  107,285,289,349,457 iterations"
## [1] "2019-04-03 02:11:04 UTC"
## Factorization rank:  9 
## [1] "NMF converged after  181,311,158,445,228 iterations"
## [1] "2019-04-03 02:11:39 UTC"
## Factorization rank:  10 
## [1] "NMF converged after  440,167,201,127,358 iterations"
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                               Normalize NMF                                ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.norm.nmf.exp <- normalizeW(atac.nmf.exp)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      K stats and normalization                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
atac.norm.nmf.exp <- my.kstats(atac.norm.nmf.exp)

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(atac.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        Generate river plot                                 ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(atac.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrix, W normalized:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(atac.annotation$color[match(levels(atac.annotation$Celltype), atac.annotation$Celltype)],
         levels(atac.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = atac.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = TRUE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in names(atac.norm.nmf.exp@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(atac.norm.nmf.exp, k = ki)
  colnames(tmp.hmatrix) <- colnames(atac.norm.nmf.exp)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = viridis(100),
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = FALSE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  
  cat("  \n Recovery plots for k=",  ki, "  \n  ")
  
  recovery_plot(tmp.hmatrix, atac.annotation, "Celltype")
  
  }


Recovery plots for k= 8


Recovery plots for k= 9


Recovery plots for k= 10

gc()
       used  (Mb) gc trigger   (Mb)  max used   (Mb)

Ncells 11148476 595.4 18697022 998.6 18697022 998.6 Vcells 87211681 665.4 160715715 1226.2 160668526 1225.9

Integrative RNAseq & ATACseq

Gene expression (RNAseq) & Chromatin accessibility (ATACseq)

Only the those samples with RNAseq and ATACseq data were used in the integrative analysis.

Data loading

Read normalized gene expression matrix and chromatin accessibility matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
int.norm.mat <- readRDS("data/multiview/multiview_norm_mat_list.RDS")
int.annotation <- readRDS("data/multiview/multiview_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
cat("Dimension of transcriptome dataset (RNAseq):  \n\n  ") 

Dimension of transcriptome dataset (RNAseq):

kable(data.frame(dim(int.norm.mat$rna), row.names = c("features", "samples")), 
      col.names = "") 
features 21811
samples 24
cat("Dimension of Chromatin accessibility dataset (ATACseq):  \n\n  ") 

Dimension of Chromatin accessibility dataset (ATACseq):

kable(data.frame(dim(int.norm.mat$atac), row.names = c("features", "samples")), 
      col.names = "") 
features 123102
samples 24

Applying integrative NMF

Applying Integrative Non-Negative Matrix Factorization (NMF) to normalized Gene expression (RNAseq) and Chromatin accessibility data (ATACseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 8
k.max 10
outer.iter 5
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
int.nmf.exp <- run_integrative_NMF_tensor(int.norm.mat, 
                                          k_min = k.min, 
                                          k_max = k.max, 
                                          outer_iter = outer.iter, 
                                          inner_iter = inner.iter, 
                                          conver_stop_threshold = 40, 
                                          lambda=0.7)
## Running integrative NMF for views:  rna,atac 
## [1] "2019-04-03 02:13:30 UTC"
## Factorization rank:  8 
## [1] "NMF converged after  145,118,178,229,211 iterations"
## [1] "2019-04-03 02:13:50 UTC"
## Factorization rank:  9 
## [1] "NMF converged after  215,196,95,154,179 iterations"
## [1] "2019-04-03 02:14:11 UTC"
## Factorization rank:  10 
## [1] "NMF converged after  371,340,136,251,270 iterations"

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                RNAseq - Plot K stats - River plot                          ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$rna, "RNAseq factorization quality metrics")
gg.optKr

## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$rna)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                ATACseq - Plot K stats - River plot                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$atac, "ATACseq factorization quality metrics")
gg.optKr

## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$atac)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrices:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(int.annotation$color[match(levels(int.annotation$Celltype), int.annotation$Celltype)],
         levels(int.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = int.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = FALSE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in k.min:k.max) {
  cat("\n")
  #cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}    \n  ")
  #plot H matrix
  cat("\n")
  cat("  \n##### not scaled  \n  ")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = FALSE, viewID = "rna", displayID = "RNAseq")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = FALSE, viewID = "atac", displayID = "ATACseq")
  
  cat("\n")
  cat("  \n##### scaled  \n  ")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = TRUE, viewID = "rna", displayID = "RNAseq")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = TRUE, viewID = "atac", displayID = "ATACseq")
  
  }

Gene set enrichment analysis

Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms

The optimal factorization rank selected was: K = 9

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                              W matrix Z scores                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
idx <- int.nmf.exp@best_factorization_idx
idx <- idx[names(idx) == "k9"]

int.rna.wmatrix <- int.nmf.exp@view_specific_WMatrix_list$k9[[idx]]$rna
rownames(int.rna.wmatrix) <- rownames(int.norm.mat$rna)

#Zscore for each signature
int.rna.wmatrix.zscores <- apply(int.rna.wmatrix, MARGIN=2, function(wmat_score){
  (wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(int.rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##         GAGE (Generally Applicable Gene-set Enrichment) analysis           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)

#run GAGE analysis
int.rna.msigDB.enrichment <- gage(int.rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)

#Drop NAs for upregulated
int.rna.msigDB.enrichment <- as.data.frame(int.rna.msigDB.enrichment$greater)
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[!is.na(int.rna.msigDB.enrichment$p.geomean),]
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]

# Select only more enriched terms in one signature compared to the others
idx <- apply(int.rna.msigDB.enrichment, 1, function(term){
  term <- -log10(term)
  # Change 0 to small value to avoid NAs
  term[term == 0] <- 1e-40
  # find if this term is more enriched in one signature compared to others
  is.enrich <- sapply(term, function(x){
    # p-value 5 times greater than at least 5 other signatures
    sum(x/term > 5) > 5
  })
  any(is.enrich)
})

int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[idx,]

# Print table
datatable(int.rna.msigDB.enrichment, filter="top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = list(list(extend = 'collection',
                                             buttons = c('excel', 'csv'),
                                             text = 'DOWNLOAD DATA')))) %>%
  formatSignif(columns=colnames(int.rna.msigDB.enrichment), digits=3)

iNMF Recovery plots:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix recovery plots                              ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rownames(int.annotation) <- int.annotation$sampleID
int.annotation$Celltype <- factor(int.annotation$Celltype, levels = unique(int.annotation$Celltype))

for(ki in k.min:k.max) {
  cat("\n")
  #cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  cat("  \n### Recovery plots for k=",  ki, "   {.tabset}      \n  ")
  #plot H matrix
  
  # Find best shared matrix, according to factorization metrics
  k <- paste0("k", ki)
  top_idx <- int.nmf.exp@best_factorization_idx
  idx <- top_idx[match(k, names(top_idx))]
  
  cat("\n")
  cat("  \n#### Shared H Matrix      \n  ")
  sharedH <- int.nmf.exp@shared_HMatrix_list[[k]][[idx]]
  colnames(sharedH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(sharedH, int.annotation, "Celltype")
  
  cat("\n")
  cat("  \n#### RNAseq H Matrix      \n  ")
  viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$rna
  colnames(viewH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(viewH, int.annotation, "Celltype")
  
  cat("\n")
  cat("  \n#### ATACseq H Matrix      \n  ")
  viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$atac
  colnames(viewH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(viewH, int.annotation, "Celltype")
  
  }
HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP